<!DOCTYPE html>
<html lang="zh-Hans">
<head>
  <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
<meta name="theme-color" content="#222">
<meta name="generator" content="Hexo 5.3.0">
  <link rel="apple-touch-icon" sizes="180x180" href="/blog/images/apple-touch-icon-next.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/blog/images/favicon-32x32-next.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/blog/images/favicon-16x16-next.png">
  <link rel="mask-icon" href="/blog/images/logo.svg" color="#222">

<!-- Stylesheets: theme CSS, the Lato web font from Google Fonts, and Font Awesome icons. -->
<link rel="stylesheet" href="/blog/css/main.css">

<!-- Explicit https: instead of a protocol-relative URL; "&" is escaped as "&amp;" inside the attribute value. -->
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic&amp;display=swap&amp;subset=latin,latin-ext">
<link rel="stylesheet" href="/blog/lib/font-awesome/css/font-awesome.min.css">


<script id="hexo-configurations">
  // Site-wide settings published as globals; the later inline scripts on this
  // page read and extend CONFIG (CONFIG.page, CONFIG.comments).
  var NexT = window.NexT || {}; // keep an already-defined NexT object, otherwise start empty
  var CONFIG = {
    // Only the host part of the site URL is kept.
    hostname: new URL('https://guodh.gitee.io/blog').hostname,
    root: '/blog/',
    scheme: 'Pisces',
    version: '7.7.1',
    exturl: false,
    sidebar: {
      position: 'left',
      display: 'post',
      padding: 18,
      offset: 12,
      onmobile: false
    },
    copycode: {
      enable: true,
      show_result: true,
      style: null
    },
    back2top: {
      enable: true,
      sidebar: false,
      scrollpercent: false
    },
    bookmark: {
      enable: false,
      color: '#222',
      save: 'auto'
    },
    fancybox: false,
    mediumzoom: false,
    lazyload: false,
    pangu: false,
    comments: {
      style: 'tabs',
      active: null,
      storage: true,
      lazyload: false,
      nav: null
    },
    algolia: {
      appID: '',
      apiKey: '',
      indexName: '',
      hits: { per_page: 10 },
      // The ${...} markers are plain text in ordinary strings, not template literals.
      labels: {
        input_placeholder: 'Search for Posts',
        hits_empty: "We didn't find any results for the search: ${query}",
        hits_stats: '${hits} results found in ${time} ms'
      }
    },
    localsearch: {
      enable: false,
      trigger: 'auto',
      top_n_per_article: 1,
      unescape: false,
      preload: false
    },
    path: 'search.xml',
    motion: {
      enable: true,
      async: false,
      transition: {
        post_block: 'fadeIn',
        post_header: 'slideDownIn',
        post_body: 'slideDownIn',
        coll_header: 'slideLeftIn',
        sidebar: 'slideUpIn'
      }
    }
  };
</script>

  <!-- Page description and Open Graph / Twitter card metadata for this post. -->
  <meta name="description" content="现在企业都是面对巨大的数据量，那么作为一个服务端工程师，势必要了解最新的大数据工具。Hadoop，Storm，Spark……大数据提供的工具也是眼花缭乱。所以，本文从网络上搜集了些大数据相关知识，便于让大家更加清晰的理解每个工具的不同之处。 涉及点:Hadoop，Storm，Samza，Spark，Flink">
<meta property="og:type" content="article">
<meta property="og:title" content="大数据工具概览">
<meta property="og:url" content="https://guodh.gitee.io/blog/2017/12/17/2017/%E5%A4%A7%E6%95%B0%E6%8D%AE%E5%B7%A5%E5%85%B7%E6%A6%82%E8%A7%88/index.html">
<meta property="og:site_name" content="蛋蛋的小屋">
<meta property="og:description" content="现在企业都是面对巨大的数据量，那么作为一个服务端工程师，势必要了解最新的大数据工具。Hadoop，Storm，Spark……大数据提供的工具也是眼花缭乱。所以，本文从网络上搜集了些大数据相关知识，便于让大家更加清晰的理解每个工具的不同之处。 涉及点:Hadoop，Storm，Samza，Spark，Flink">
<!-- og:locale uses language_TERRITORY form; zh_CN chosen to match lang="zh-Hans" on <html> (confirm against site config). -->
<meta property="og:locale" content="zh_CN">
<meta property="article:published_time" content="2017-12-17T04:03:51.000Z">
<meta property="article:modified_time" content="2018-01-17T15:07:13.000Z">
<meta property="article:author" content="RunningEgg">
<meta property="article:tag" content="大数据">
<meta name="twitter:card" content="summary">

<link rel="canonical" href="https://guodh.gitee.io/blog/2017/12/17/2017/%E5%A4%A7%E6%95%B0%E6%8D%AE%E5%B7%A5%E5%85%B7%E6%A6%82%E8%A7%88/">


<script id="page-configurations">
  // Per-page flags attached to the global CONFIG object.
  // Variable reference: https://hexo.io/docs/variables.html
  CONFIG.page = { sidebar: "", isHome: false, isPost: true };
</script>

  <title>大数据工具概览 | 蛋蛋的小屋</title>
  






  <noscript>
  <style>
  /*
   * Applied only when scripting is disabled: reset opacity and offsets on the
   * animated elements to their initial values.
   * NOTE(review): presumably the theme CSS hides these until its animations run; confirm in main.css.
   */

  /* Every element whose opacity is reset. */
  .use-motion .brand,
  .use-motion .menu-item,
  .sidebar-inner,
  .use-motion .post-block,
  .use-motion .pagination,
  .use-motion .comments,
  .use-motion .post-header,
  .use-motion .post-body,
  .use-motion .collection-header,
  .use-motion .site-title,
  .use-motion .site-subtitle {
    opacity: initial;
  }

  /* Title and subtitle also get their vertical offset reset. */
  .use-motion .site-title,
  .use-motion .site-subtitle {
    top: initial;
  }

  /* Logo lines: reset the horizontal offset on each side. */
  .use-motion .logo-line-before i {
    left: initial;
  }

  .use-motion .logo-line-after i {
    right: initial;
  }
  </style>
</noscript>

</head>

<body itemscope itemtype="http://schema.org/WebPage">
  <div class="container use-motion">
    <div class="headband"></div>

    <header class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-container">
  <div class="site-meta">

    <div>
      <a href="/blog/" class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">蛋蛋的小屋</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
        <p class="site-subtitle">来了就看看吧，没有刀剑只有故事</p>
  </div>

  <div class="site-nav-toggle">
    <div class="toggle" aria-label="Toggle navigation bar">
      <span class="toggle-line toggle-line-first"></span>
      <span class="toggle-line toggle-line-middle"></span>
      <span class="toggle-line toggle-line-last"></span>
    </div>
  </div>
</div>


<nav class="site-nav">
  
  <ul id="menu" class="menu">
        <li class="menu-item menu-item-home">

    <a href="/blog/" rel="section"><i class="fa fa-fw fa-home"></i>Home</a>

  </li>
        <li class="menu-item menu-item-about">

    <a href="/blog/about/" rel="section"><i class="fa fa-fw fa-user"></i>About</a>

  </li>
        <li class="menu-item menu-item-archives">

    <a href="/blog/archives/" rel="section"><i class="fa fa-fw fa-archive"></i>Archives</a>

  </li>
        <li class="menu-item menu-item-book">

    <a href="/blog/book/" rel="section"><i class="fa fa-fw fa-archive"></i>book</a>

  </li>
  </ul>

</nav>
</div>
    </header>

    
  <div class="back-to-top">
    <i class="fa fa-arrow-up"></i>
    <span>0%</span>
  </div>


    <main class="main">
      <div class="main-inner">
        <div class="content-wrap">
          

          <div class="content">
            

  <div class="posts-expand">
      
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-block " lang="zh-Hans">
    <link itemprop="mainEntityOfPage" href="https://guodh.gitee.io/blog/2017/12/17/2017/%E5%A4%A7%E6%95%B0%E6%8D%AE%E5%B7%A5%E5%85%B7%E6%A6%82%E8%A7%88/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/blog/images/avatar.gif">
      <meta itemprop="name" content="RunningEgg">
      <meta itemprop="description" content="写下我的点点滴滴，写下我对这个世界的看法，还有我内心所崇拜的技术">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="蛋蛋的小屋">
    </span>
      <header class="post-header">
        <h1 class="post-title" itemprop="name headline">
          大数据工具概览
        </h1>

        <div class="post-meta">
            <span class="post-meta-item">
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              <span class="post-meta-item-text">Posted on</span>

              <time title="Created: 2017-12-17 12:03:51" itemprop="dateCreated datePublished" datetime="2017-12-17T12:03:51+08:00">2017-12-17</time>
            </span>
              <span class="post-meta-item">
                <span class="post-meta-item-icon">
                  <i class="fa fa-calendar-check-o"></i>
                </span>
                <span class="post-meta-item-text">Edited on</span>
                <time title="Modified: 2018-01-17 23:07:13" itemprop="dateModified" datetime="2018-01-17T23:07:13+08:00">2018-01-17</time>
              </span>

          

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">

      
        <p>现在企业都是面对巨大的数据量，那么作为一个服务端工程师，势必要了解最新的大数据工具。Hadoop，Storm，Spark……大数据提供的工具也是眼花缭乱。所以，本文从网络上搜集了些大数据相关知识，便于让大家更加清晰的理解每个工具的不同之处。</p>
<p>涉及点:Hadoop，Storm，Samza，Spark，Flink</p>
<a id="more"></a>

<blockquote>
<p><a href="http://www.infoq.com/cn/articles/hadoop-storm-samza-spark-flink">大数据框架对比：Hadoop、Storm、Samza、Spark和Flink</a></p>
</blockquote>
<h2 id="Hadoop"><a href="#Hadoop" class="headerlink" title="Hadoop"></a>Hadoop</h2><p>Hadoop是最早被我们熟知的大数据处理框架，其实可以称之为“批处理”框架，因为每一次他都是读取一批数据进行分析，然后进行reduce()操作。<br>缺点：每次结果都要存到File上，导致速度比较慢。<br>典型应用：处理报表。</p>
<h2 id="Storm"><a href="#Storm" class="headerlink" title="Storm"></a>Storm</h2><p>完全区别于Hadoop，Storm是流式处理，什么意思呢？就是Storm是一条一条处理数据，能够保证极低的延迟。学术点讲，就是拓扑结构。<br>缺点：无法进行批处理，顺序不一定对。<br>优点：极低的延迟，而且可接入的语言多（包括非JVM语言）。</p>
<h2 id="Samza"><a href="#Samza" class="headerlink" title="Samza"></a>Samza</h2><p>Samza和Storm类似，但是和Kafka紧密绑定，依赖于Kafka的Topic，broker……</p>
<h2 id="Spark"><a href="#Spark" class="headerlink" title="Spark"></a>Spark</h2><p>spark其实类似于Hadoop的Map-reduce，然而不同的是，spark在hadoop的概念上，提出了RDD(Resilient Distributed Datasets)的概念。将数据的转化，流转全都放在了RDD这个模型上面。同时，基于这个设计模型，利用spark自身带有的任务调度器来生成DAG（有向无环图）。<br>另外有个很明显的特色是，惰性求值。</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">JavaRDD&lt;String&gt; inputRDD = sc.textFile(<span class="string">&quot;/xxx.text&quot;</span>);</span><br></pre></td></tr></table></figure>
<p>像上面这样的语句，在我们印象中，都是立刻运行。然而在spark中只有在依赖inputRDD的时候才会去读取，并且读取的操作也可能是多次读取。</p>
<p>同时，Spark也支持“流式处理”。但Spark的流式处理，其实是一个微批次处理，同样基于RDD的模型。将间隔一定时间的数据输入到RDD中，进行DAG处理，达到流式处理的结果。当然这个时效性和Storm的肯定有差距。</p>
<h2 id="Flink"><a href="#Flink" class="headerlink" title="Flink"></a>Flink</h2><p>Flink虽然是混合类型，但是和Spark不同，他是在流式处理的模式上进行了扩展。它的应用场景和Storm更加的类似，具体可以看这个。所以Flink针对的同样是需要低延迟处理的场景。</p>
<blockquote>
<p><a href="https://tech.meituan.com/Flink_Benchmark.html">流计算框架 Flink 与 Storm 的性能对比</a><br>综合上述测试结果，以下实时计算场景建议考虑使用 Flink 框架进行计算：<br>要求消息投递语义为 Exactly Once 的场景；（数据不会重复的情况）<br>数据量较大，要求高吞吐低延迟的场景；<br>需要进行状态管理或窗口统计的场景。（例如：5分钟多少人点赞）</p>
</blockquote>

    </div>

    
    
    

      <footer class="post-footer">
          <div class="post-tags">
              <a href="/blog/tags/%E5%A4%A7%E6%95%B0%E6%8D%AE/" rel="tag"># 大数据</a>
          </div>

        


        
    <div class="post-nav">
      <div class="post-nav-item">
    <a href="/blog/2017/12/03/2017/Spring%E7%9A%84IOC%E3%80%81AOP%E6%BA%90%E7%A0%81%E8%A7%A3%E6%9E%90/" rel="prev" title="Spring的IOC、AOP源码解析">
      <i class="fa fa-chevron-left"></i> Spring的IOC、AOP源码解析
    </a></div>
      <div class="post-nav-item">
    <a href="/blog/2018/01/07/2018/LSM-Tree%E4%BB%8B%E7%BB%8D/" rel="next" title="LSM树的不同实现介绍">
      LSM树的不同实现介绍 <i class="fa fa-chevron-right"></i>
    </a></div>
    </div>
      </footer>
    
  </article>
  
  
  

  </div>


          </div>
          

<script>
  // Remembers which comment tab was last selected and re-selects it when the
  // tabs are registered. Persistence is used only when CONFIG.comments.storage
  // is truthy.
  (() => {
    const STORAGE_KEY = 'comments_active';

    // localStorage access can throw (for example a SecurityError when site data
    // is blocked, or a quota error on write). Persistence is best effort, so
    // both helpers swallow the error instead of aborting the event handler.
    const readStoredTab = () => {
      try {
        return localStorage.getItem(STORAGE_KEY);
      } catch (err) {
        return null;
      }
    };
    const writeStoredTab = value => {
      try {
        localStorage.setItem(STORAGE_KEY, value);
      } catch (err) {
        // Ignored: the tab choice is simply not remembered.
      }
    };

    window.addEventListener('tabs:register', () => {
      // NOTE(review): the CONFIG literal in this page defines comments.active,
      // not comments.activeClass; confirm which property is intended.
      let activeClass = CONFIG.comments.activeClass;
      if (CONFIG.comments.storage) {
        activeClass = readStoredTab() || activeClass;
      }
      if (!activeClass) return;

      const activeTab = document.querySelector(`a[href="#comment-${activeClass}"]`);
      if (activeTab) {
        activeTab.click();
      }
    });

    if (CONFIG.comments.storage) {
      window.addEventListener('tabs:click', event => {
        if (!event.target.matches('.tabs-comment .tab-content .tab-pane')) return;
        // The second class on the pane is stored as the tab identifier.
        const commentClass = event.target.classList[1];
        writeStoredTab(commentClass);
      });
    }
  })();
</script>

        </div>
          
  
  <div class="toggle sidebar-toggle">
    <span class="toggle-line toggle-line-first"></span>
    <span class="toggle-line toggle-line-middle"></span>
    <span class="toggle-line toggle-line-last"></span>
  </div>

  <aside class="sidebar">
    <div class="sidebar-inner">

      <ul class="sidebar-nav motion-element">
        <li class="sidebar-nav-toc">
          Table of Contents
        </li>
        <li class="sidebar-nav-overview">
          Overview
        </li>
      </ul>

      <!--noindex-->
      <div class="post-toc-wrap sidebar-panel">
          <div class="post-toc motion-element"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#Hadoop"><span class="nav-number">1.</span> <span class="nav-text">Hadoop</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Storm"><span class="nav-number">2.</span> <span class="nav-text">Storm</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Samza"><span class="nav-number">3.</span> <span class="nav-text">Samza</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Spark"><span class="nav-number">4.</span> <span class="nav-text">Spark</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Flink"><span class="nav-number">5.</span> <span class="nav-text">Flink</span></a></li></ol></div>
      </div>
      <!--/noindex-->

      <div class="site-overview-wrap sidebar-panel">
        <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
  <p class="site-author-name" itemprop="name">RunningEgg</p>
  <div class="site-description" itemprop="description">写下我的点点滴滴，写下我对这个世界的看法，还有我内心所崇拜的技术</div>
</div>
<div class="site-state-wrap motion-element">
  <nav class="site-state">
      <div class="site-state-item site-state-posts">
          <a href="/blog/archives/">
        
          <span class="site-state-item-count">89</span>
          <span class="site-state-item-name">posts</span>
        </a>
      </div>
      <div class="site-state-item site-state-tags">
        <span class="site-state-item-count">24</span>
        <span class="site-state-item-name">tags</span>
      </div>
  </nav>
</div>
  <div class="links-of-author motion-element">
      <span class="links-of-author-item">
        <a href="mailto:guodh_1@163.com" title="E-Mail → mailto:guodh_1@163.com" rel="noopener" target="_blank"><i class="fa fa-fw fa-envelope"></i>E-Mail</a>
      </span>
  </div>



      </div>

    </div>
  </aside>
  <div id="sidebar-dimmer"></div>


      </div>
    </main>

    <footer class="footer">
      <div class="footer-inner">
        

<div class="copyright">
  
  &copy; 
  <span itemprop="copyrightYear">2022</span>
  <span class="with-love">
    <i class="fa fa-user"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">RunningEgg</span>
</div>
  <div class="powered-by">Powered by <a href="https://hexo.io/" class="theme-link" rel="noopener" target="_blank">Hexo</a> v5.3.0
  </div>
  <span class="post-meta-divider">|</span>
  <div class="theme-info">Theme – <a href="https://pisces.theme-next.org/" class="theme-link" rel="noopener" target="_blank">NexT.Pisces</a> v7.7.1
  </div>

        








      </div>
    </footer>
  </div>

  
  <script src="/blog/lib/anime.min.js"></script>
  <script src="/blog/lib/velocity/velocity.min.js"></script>
  <script src="/blog/lib/velocity/velocity.ui.min.js"></script>

<script src="/blog/js/utils.js"></script>

<script src="/blog/js/motion.js"></script>


<script src="/blog/js/schemes/pisces.js"></script>


<script src="/blog/js/next-boot.js"></script>




  















  

  

</body>
</html>
